Inês Silva 202008362, Maria Miguel , Renatha Silva
Project 1 - Machine Learning¶
Task 1: Investigate method assumptions¶
To better visualize the borders created by each method and evaluate the results of its application on each dataset, we created the functions 'evaluate_approaches' and 'plot_approaches'.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors
import seaborn as sns
from matplotlib import colormaps
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import PolynomialFeatures
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
# Module-level two-color palette: class 0 -> blue, class 1 -> orange.
# Used by plot_approaches to color the scattered data points by label.
cmap = mpl.colors.ListedColormap(['blue', 'orange'])
def plot_classifier_boundary(model, X, ax):
    """Draw the 2-class decision regions of a fitted sklearn classifier on ax."""
    region_cmap = mpl.colors.ListedColormap(['lightsteelblue', 'peachpuff'])
    # Bounding box of the data, padded by 0.2 on every side.
    x_lo, x_hi = X[:, 0].min() - .2, X[:, 0].max() + .2
    y_lo, y_hi = X[:, 1].min() - .2, X[:, 1].max() + .2
    # Dense grid with step 0.05 covering the padded box.
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, 0.05),
                                 np.arange(y_lo, y_hi, 0.05))
    # Classify every grid point (ravel flattens the grids into coordinate pairs)
    # and paint the two regions with the light colormap.
    labels = model.predict(np.c_[grid_x.ravel(), grid_y.ravel()])
    ax.contourf(grid_x, grid_y, labels.reshape(grid_x.shape), cmap=region_cmap)
    ax.set_xlim((x_lo, x_hi))
    ax.set_ylim((y_lo, y_hi))
def evaluate_approaches(X, y, cv, verbose=True):
    """Evaluate Logistic Regression, LDA, QDA, SVMs and Decision Trees with cross-validation.

    Parameters
    ----------
    X, y : feature matrix and labels.
    cv : number of cross-validation folds (the original hard-coded cv=10 and
        ignored this argument).
    verbose : print each method's accuracy when True.

    Returns
    -------
    dict mapping method name to its mean cross-validated accuracy.
    """
    def _mean_cv_accuracy(model):
        # Mean accuracy over all folds. The original kept only fold 0 via
        # `[0]`, which is a much noisier estimate of generalization accuracy.
        return cross_val_score(model, X, y, cv=cv, scoring='accuracy').mean()

    logr_acc = _mean_cv_accuracy(LogisticRegression())
    lda_acc = _mean_cv_accuracy(LinearDiscriminantAnalysis())
    qda_acc = _mean_cv_accuracy(QuadraticDiscriminantAnalysis())
    svm_rbf_acc = _mean_cv_accuracy(SVC(kernel='rbf'))
    svm_linear_acc = _mean_cv_accuracy(SVC(kernel='linear'))
    dt_acc = _mean_cv_accuracy(DecisionTreeClassifier())
    # Fix: this model is reported as "max depth=2" but was built unbounded.
    dt_depth_acc = _mean_cv_accuracy(DecisionTreeClassifier(max_depth=2))
    if verbose:
        print(f'LogReg accuracy: {logr_acc:.2f}')
        print(f'LDA accuracy: {lda_acc:.2f}')
        print(f'QDA accuracy: {qda_acc:.2f}')
        print(f'SVM with rbf kernel accuracy: {svm_rbf_acc:.2f}')
        print(f'SVM with linear kernel accuracy: {svm_linear_acc:.2f}')
        print(f'DT accuracy: {dt_acc:.2f}')
        print(f'DT with max depth=2 accuracy: {dt_depth_acc:.2f}')
    # Fix: the key 'svm with rbf' appeared twice in the original dict, so the
    # rbf result was silently overwritten by the linear one.
    return {'logr': logr_acc, 'lda': lda_acc, 'qda': qda_acc, 'dt': dt_acc,
            'svm with rbf': svm_rbf_acc, 'svm with linear': svm_linear_acc,
            'dt_depth2': dt_depth_acc}
def plot_approaches(X, y):
    """Fit each classifier on (X, y) and plot its decision boundary in a 3x3 grid.

    The data points are overlaid on every panel using the module-level `cmap`.
    The last two grid cells stay empty (seven models, nine axes).
    """
    # Fix: the original discarded the figure handle (`_, axs = ...`) and then
    # called `fig.tight_layout(...)`, raising a NameError.
    fig, axs = plt.subplots(3, 3, figsize=(10, 10))
    models = [
        ('Logistic regression', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
        ('SVM_RBF', SVC(kernel='rbf')),
        ('SVM_Linear', SVC(kernel='linear')),
        ('DT', DecisionTreeClassifier()),
        ('DT_Depth2', DecisionTreeClassifier(max_depth=2)),
    ]
    # One panel per model; same fit/boundary/scatter/labels sequence as before.
    for ax, (title, model) in zip(axs.ravel(), models):
        model.fit(X, y)
        plot_classifier_boundary(model, X, ax)
        ax.scatter(X[:, 0], X[:, 1], color=cmap(y), alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel('$x_1$')
        ax.set_ylabel('$x_2$')
    fig.tight_layout(pad=5.0)
    plt.show()
The objective in this section was to find, for each of the methods listed below, a dataset where the respective assumptions are met while the assumptions of the other methods are not. More particularly, a dataset where one method is hard to beat using cross-validation. All generated datasets use 2 predictors and 2 classes, in order to better compare the differences.
Dataset for Logistic Regression¶
Assumptions:
- Non-linear Separability
- No Multicollinearity
- Noise
- No Outliers
- Balanced Classes
def complex_logreg_data_generator(N, r, e_b, e_o):
    """Generate a 2-class, 2-feature dataset with asymmetric exponential noise.

    Parameters
    ----------
    N : base number of points per class.
    r : size multiplier; each class gets N*r points.
    e_b, e_o : exponential-noise scales for the blue / orange class.

    Returns (X, y) where class 0 (blue) is shifted up and class 1 (orange)
    shifted down by exponential noise, breaking the Gaussian assumption.
    """
    # x-coordinates evenly spread over [-5, 5] for each class.
    x_b = np.linspace(-5, 5, N * r)
    x_o = np.linspace(-5, 5, N * r)
    # Blue class: ADD exponential noise to the second coordinate.
    # Fix: noise must have len(x_b) samples; the original drew only N,
    # which crashes with a shape mismatch whenever r != 1.
    x_exp_blue = x_b + np.random.exponential(e_b, len(x_b))
    X_blue = np.vstack([x_b, x_exp_blue]).T
    y_blue = np.zeros(len(X_blue))
    # Orange class: SUBTRACT exponential noise from the second coordinate.
    x_exp_orange = x_o - np.random.exponential(e_o, len(x_o))
    X_orange = np.vstack([x_o, x_exp_orange]).T
    y_orange = np.ones(len(X_orange))
    # Stack both classes into one dataset with matching labels.
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    return X, y
# Build the logistic-regression-friendly dataset and inspect it visually.
X, y = complex_logreg_data_generator(200, 1, 3, 1)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
ax.set(xlabel='Feature 1', ylabel='Feature 2', title='Generated Dataset')
plt.show()
The similarity in performance between Logistic Regression and SVM with a linear kernel suggests that the dataset's characteristics are conducive to linear separation, and both methods effectively leverage this property to achieve their maximum accuracy.
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 0.90 LDA accuracy: 0.90 QDA accuracy: 0.88 SVM with rbf kernel accuracy: 0.88 SVM with linear kernel accuracy: 0.93 DT accuracy: 0.88 DT with max depth=2 accuracy: 0.88
Linear Discriminant Analysis (LDA)¶
Assumptions:
- Linearity
- Multivariate Normal Distribution within each class
- Equal covariance matrices of the predictor variables for all classes
- Number of observations in each class is large enough relative to the number of predictor variables
def lda_dataset_generator(n_samples):
    """Generate two Gaussian classes with equal covariance for LDA.

    Both classes use scale=1 (equal covariance, satisfying the LDA
    assumption), centred at (-1, -1) and (1, 1). n_samples is split evenly
    between the classes.

    Raises
    ------
    ValueError : when there are too few observations per class.
    """
    # Validate up front with a real exception: `assert` is stripped under
    # `python -O`, and the original only checked AFTER the data was built.
    if n_samples // 2 <= 2:
        raise ValueError(
            "Number of observations per class should be greater than 2 for LDA")
    # Fixed seed for reproducibility.
    np.random.seed(5)
    half = n_samples // 2
    # Blue class around (-1, -1), orange class around (1, 1).
    X_blue = np.random.normal(loc=[-1, -1], scale=1, size=(half, 2))
    y_blue = np.zeros(half)
    X_orange = np.random.normal(loc=[1, 1], scale=1, size=(half, 2))
    y_orange = np.ones(half)
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    return X, y
# Build the LDA-friendly dataset, look at it, then benchmark all methods.
X, y = lda_dataset_generator(100)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
ax.set(xlabel='Feature 1', ylabel='Feature 2',
       title='LDA Dataset with Normal Distribution')
plt.show()
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 0.90 LDA accuracy: 1.00 QDA accuracy: 1.00 NB accuracy: 0.90 SVM with rbf kernel accuracy: 1.00 SVM with linear kernel accuracy: 0.90 DT accuracy: 0.70 DT with max depth=2 accuracy: 0.80
LDA still was not able to outperform more complex methods like QDA and SVM with an RBF kernel, since these methods have more tuning parameters, such as the regularization parameter in SVM and the prior probabilities in QDA. These additional parameters provide more flexibility in fine-tuning the models to the specific characteristics of the dataset, which potentially leads to an improved performance.
Dataset for Quadratic Discriminant Analysis¶
Assumptions:
- Non-Linearity
- Normality
- Distinct Classes
- Different Covariances
- Independence
- Correlation Between Values
- Imbalanced Class Distribution
- Feature Independence
from sklearn.neighbors import KNeighborsClassifier as KNN
def qda_data_generator(n1, n2):
    """Generate two correlated Gaussian classes with DIFFERENT covariances.

    Different covariance matrices per class (and imbalanced sizes n1/n2,
    chosen by the caller) favour QDA over LDA.
    """
    # Fix: a covariance matrix must be symmetric; the original passed
    # [[1, -0.3], [-0.7, 2]], which is not a valid covariance and makes
    # numpy's behaviour ill-defined. Explicit symmetric matrices instead.
    cov_blue = [[1, -0.5], [-0.5, 2]]
    cov_orange = [[0.5, 0.7], [0.7, 1]]
    x1 = np.random.multivariate_normal([1, 1], cov_blue, n1)
    x2 = np.random.multivariate_normal([-1, -1], cov_orange, n2)
    # Labels: 0 for every blue point, 1 for every orange point.
    y1 = np.zeros(len(x1))
    y2 = np.ones(len(x2))
    # Stack both classes into one dataset with matching labels.
    X = np.concatenate([x1, x2])
    y = np.concatenate([y1, y2])
    return X, y
# Imbalanced dataset (250 blue vs 100 orange) tailored to QDA's assumptions.
X, y = qda_data_generator(250, 100)
# NOTE(review): this rebinds the module-level `cmap` used by plot_approaches
# to a blue/red palette for all subsequent plots.
cmap = mpl.colors.ListedColormap(['blue', 'red'])
evaluate_approaches(X,y,10)
plot_approaches(X,y)
LogReg accuracy: 0.94 LDA accuracy: 0.94 QDA accuracy: 1.00 NB accuracy: 0.94 SVM with rbf kernel accuracy: 1.00 SVM with linear kernel accuracy: 0.94 DT accuracy: 0.97 DT with max depth=2 accuracy: 0.97
Both QDA and SVM with RBF kernel excel in capturing the intricate structure of the data, making them robust choices for classification tasks where linear methods may struggle to discern complex relationships.
Dataset for SVM with linear kernel¶
Assumptions:
- Linear Separability
- Linear Relationships
- Feature Independence
- Homogeneous Data
- Balanced Classes
- Gaussian Distribution
- Non-linear Noise
- Outliers
def svm_l_data_generator(n_blue, n_orange, noise_factor=0.1, outlier_factor=0.1):
    """Two linearly separable Gaussian blobs plus uniform noise points and outliers.

    noise_factor / outlier_factor are fractions of the current dataset size
    used to decide how many extra randomly-labelled points to append.
    """
    # Class blobs: blue around (1, 1), orange around (-1, -1).
    X_blue = np.random.normal(loc=[1, 1], scale=1, size=(n_blue, 2))
    y_blue = np.zeros(n_blue)
    X_orange = np.random.normal(loc=[-1, -1], scale=1, size=(n_orange, 2))
    y_orange = np.ones(n_orange)
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    # Non-linear noise: uniform points in a small box with random labels.
    n_noise = int(noise_factor * len(X))
    noise_pts = np.random.uniform(low=-0.5, high=0.5, size=(n_noise, 2))
    noise_lbl = np.random.choice([0, 1], size=n_noise)
    X = np.concatenate([X, noise_pts])
    y = np.concatenate([y, noise_lbl])
    # Outliers: uniform points in a wider box, again with random labels.
    # The count is based on the dataset size AFTER the noise was appended.
    n_out = int(outlier_factor * len(X))
    outlier_pts = np.random.uniform(low=-2, high=2, size=(n_out, 2))
    outlier_lbl = np.random.choice([0, 1], size=n_out)
    X = np.concatenate([X, outlier_pts])
    y = np.concatenate([y, outlier_lbl])
    return X, y
# Build the linearly separable dataset, inspect it, then benchmark all methods.
X, y = svm_l_data_generator(500, 200)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', alpha=0.5)
ax.set(xlabel='x1', ylabel='x2', title='Dataset with Reduced Outliers')
ax.grid()
plt.show()
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 1.00 LDA accuracy: 1.00 QDA accuracy: 0.99 NB accuracy: 1.00 SVM with rbf kernel accuracy: 0.98 SVM with linear kernel accuracy: 1.00 DT accuracy: 0.92 DT with max depth=2 accuracy: 0.92
The maximum accuracy achieved by SVM with a linear kernel, Logistic Regression, and LDA indicates that the dataset exhibits clear linear separability, allowing these methods to accurately classify each instance with superior performance.
from sklearn.datasets import make_circles
def svm_rbf_data_generator(n):
    """Concentric-circles dataset: radially (not linearly) separable.

    With noise=.3 the rings overlap enough that the RBF-kernel SVM
    outperforms the linear methods.
    """
    return make_circles(n, factor=.5, noise=.3, random_state=5)
# Build the circles dataset, inspect it, then benchmark all methods.
X, y = svm_rbf_data_generator(200)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', alpha=0.5)
ax.set(xlabel='x1', ylabel='x2', title='Dataset for SVM with rbf kernel')
ax.grid()
plt.show()
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 0.60 LDA accuracy: 0.60 QDA accuracy: 0.70 NB accuracy: 0.70 SVM with rbf kernel accuracy: 0.75 SVM with linear kernel accuracy: 0.45 DT accuracy: 0.65 DT with max depth=2 accuracy: 0.70
Dataset for Decision Trees¶
Assumptions:
- Noise
- Non-normal distribution
- Non-linear boundary
- Different covariances
- Dependency between variables
def decision_tree(n_blue, n_orange):
    """Generate a tree-friendly dataset: non-normal, differently spread classes.

    Blue: two Gaussian draws sharing the centre (-1, 1) but with different
    spreads (2.5 and 1.5). Orange: exponential draws (non-normal) with two
    different scales. Class sizes are n_blue and n_orange (rounded down to
    even via //2 per sub-cluster).
    """
    X_blue_1 = np.random.normal(loc=[-1, 1], scale=2.5, size=(n_blue // 2, 2))
    X_blue_2 = np.random.normal(loc=[-1, 1], scale=1.5, size=(n_blue // 2, 2))
    X_blue = np.concatenate([X_blue_1, X_blue_2])
    # Fix: size the labels from the data actually generated; the original
    # used np.zeros(n_blue), which mismatches len(X_blue) for odd n_blue.
    y_blue = np.zeros(len(X_blue))
    X_orange_1 = np.random.exponential(scale=2.5, size=(n_orange // 2, 2))
    X_orange_2 = np.random.exponential(scale=1.5, size=(n_orange // 2, 2))
    X_orange = np.concatenate([X_orange_1, X_orange_2])
    y_orange = np.ones(len(X_orange))
    # Stack both classes into one dataset with matching labels.
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    return X, y
# Build the tree-friendly dataset, inspect it, then benchmark all methods.
X, y = decision_tree(500, 200)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
ax.set(xlabel='Feature 1', ylabel='Feature 2',
       title='Decision Tree Dataset with Distinct Clusters')
plt.show()
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 0.80 LDA accuracy: 0.80 QDA accuracy: 0.80 NB accuracy: 0.80 SVM with rbf kernel accuracy: 0.86 SVM with linear kernel accuracy: 0.80 DT accuracy: 0.90 DT with max depth=2 accuracy: 0.89
Dataset for decision trees with max_depth of 2¶
Assumptions:
- Noise
- Non-normal distribution
- Non-linear boundary
- Different covariances
- Feature Independence
- Equal Variability (scale parameter, in order to reduce complexity a bit)
def decision_tree_depth_2(n_blue, n_orange):
    """Generate a simpler tree-friendly dataset for a depth-2 tree.

    Blue: two Gaussian clusters at (1, -1) and (-1, 1) with EQUAL spread
    (scale=1.5) to keep the structure simple. Orange: exponential
    (non-normal) draws, also with equal scales.
    """
    X_blue_1 = np.random.normal(loc=[1, -1], scale=1.5, size=(n_blue // 2, 2))
    X_blue_2 = np.random.normal(loc=[-1, 1], scale=1.5, size=(n_blue // 2, 2))
    X_blue = np.concatenate([X_blue_1, X_blue_2])
    # Fix: size the labels from the data actually generated; the original
    # used np.zeros(n_blue), which mismatches len(X_blue) for odd n_blue.
    y_blue = np.zeros(len(X_blue))
    X_orange_1 = np.random.exponential(scale=1.5, size=(n_orange // 2, 2))
    X_orange_2 = np.random.exponential(scale=1.5, size=(n_orange // 2, 2))
    X_orange = np.concatenate([X_orange_1, X_orange_2])
    y_orange = np.ones(len(X_orange))
    # Stack both classes into one dataset with matching labels.
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    return X, y
# Build the simplified dataset, inspect it, then benchmark all methods.
X, y = decision_tree_depth_2(500, 200)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
ax.set(xlabel='Feature 1', ylabel='Feature 2',
       title='Modified Decision Tree Dataset with Distinct Clusters')
plt.show()
evaluate_approaches(X, y, 10)
plot_approaches(X, y)
LogReg accuracy: 0.80 LDA accuracy: 0.80 QDA accuracy: 0.81 NB accuracy: 0.77 SVM with rbf kernel accuracy: 0.87 SVM with linear kernel accuracy: 0.80 DT accuracy: 0.84 DT with max depth=2 accuracy: 0.86
Task 2: Bias, Variance and Model Capacity¶
Post-Pruning or ‘backward pruning’ is a technique that eliminates branches after computing the model to reduce its complexity and variance. This allows the decision tree to grow to its full depth, and only then, removes branches to prevent the model from overfitting. In Post-Pruning, non-significant branches of the model are removed using the Cost Complexity Pruning (CCP) technique. Alpha is known as the complexity parameter.
Greater values of ccp_alpha increase the number of nodes pruned. As alpha increases, more of the tree is pruned, which increases the total impurity of its leaves. Higher values of this parameter enforce stronger regularization, which prevents the tree from overfitting to the training data by penalizing the growth of the tree.
Effect of Noise on Optimal ccp_alpha Value:¶
def decision_tree(n_blue, n_orange):
    """Generate the tree-friendly dataset (redefined for Task 2).

    Blue: two Gaussian draws sharing the centre (-1, 1) with different
    spreads (2.5 and 1.5). Orange: exponential (non-normal) draws with two
    different scales.
    """
    X_blue_1 = np.random.normal(loc=[-1, 1], scale=2.5, size=(n_blue // 2, 2))
    X_blue_2 = np.random.normal(loc=[-1, 1], scale=1.5, size=(n_blue // 2, 2))
    X_blue = np.concatenate([X_blue_1, X_blue_2])
    # Fix: size the labels from the data actually generated; the original
    # used np.zeros(n_blue), which mismatches len(X_blue) for odd n_blue.
    y_blue = np.zeros(len(X_blue))
    X_orange_1 = np.random.exponential(scale=2.5, size=(n_orange // 2, 2))
    X_orange_2 = np.random.exponential(scale=1.5, size=(n_orange // 2, 2))
    X_orange = np.concatenate([X_orange_1, X_orange_2])
    y_orange = np.ones(len(X_orange))
    # Stack both classes into one dataset with matching labels.
    X = np.concatenate([X_blue, X_orange])
    y = np.concatenate([y_blue, y_orange])
    return X, y
# Generate the base dataset once, then measure accuracy vs ccp_alpha for
# several additive Gaussian noise levels.
X, y = decision_tree(500, 200)
noise_levels = [0, 0.25, 0.5, 0.75, 1]
accuracies = []
ccp_alphas = np.linspace(0, 0.02, 30)
for noise_level in noise_levels:
    # Fix: perturb a COPY of the clean data. The original reassigned X
    # (`X = X + noise`), so each noise level was stacked on top of all the
    # previous ones and the effective noise grew across iterations.
    X_noisy = X + np.random.normal(scale=noise_level, size=X.shape)
    X_train, X_test, y_train, y_test = train_test_split(
        X_noisy, y, test_size=0.2, random_state=42)
    acc = []
    for ccp_alpha in ccp_alphas:
        # Larger ccp_alpha prunes more aggressively (stronger regularization).
        clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(X_test)))
    accuracies.append(acc)
# One accuracy curve per noise level.
plt.figure(figsize=(10, 6))
for i, noise_level in enumerate(noise_levels):
    plt.plot(ccp_alphas, accuracies[i], label=f"Noise Level: {noise_level}")
plt.xlabel('ccp_alpha')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. ccp_alpha for Different Noise Levels')
plt.legend()
plt.grid(True)
plt.show()
Without the presence of noise, ccp_alpha is typically >= 0.0050, balancing complexity and accuracy to prevent overfitting. With a noise value of 0.25, best accuracy requires ccp_alpha >= 0.0125, allowing flexibility to capture patterns amidst moderate noise. As noise increases, optimal ccp_alpha narrows: 0.0025 - 0.0075 for noise=0.50, and <=0.025 for noise=0.75. At maximum noise (1), optimal alpha is <=0.0050, to avoid underfitting.
Bias and Variance¶
Bias and Variance are important to have a better understanding of the model capacity and its regularization abitily. The bias error represents the distance between the average of the predictions made by the model and the actual value. The variance error refers to the average of the distances between the values predicted by the model and the actual value.
def error_decomposition(ccp_alpha):
    """Estimate squared bias, variance and total error of a pruned tree.

    Draws one fixed test set and two independent training sets, fits one
    tree per training set, and decomposes the test error of the resulting
    predictions.
    """
    X_test, y_test = decision_tree(500, 200)
    # Fit one tree per fresh training sample and collect its predictions.
    preds = np.array([
        DecisionTreeClassifier(ccp_alpha=ccp_alpha)
        .fit(*decision_tree(500, 200))
        .predict(X_test)
        for _ in range(2)
    ])
    # Bias^2: squared distance between the average prediction and the truth.
    squared_bias = np.mean((preds.mean(axis=0) - y_test) ** 2)
    # Variance: spread of the individual predictions around their average.
    variance = np.var(preds, axis=0).mean()
    # Total error: mean squared error over all models and test points.
    total_err = np.mean((preds - y_test) ** 2)
    return squared_bias, variance, total_err
# Decompose the error at every ccp_alpha and plot the three curves.
results = {"ccp_alpha": [], "bias": [], "variance": [], "total_err": []}
for ccp_alpha in ccp_alphas:
    bias, variance, total_err = error_decomposition(ccp_alpha)
    results["ccp_alpha"].append(ccp_alpha)
    results["bias"].append(bias)
    results["variance"].append(variance)
    results["total_err"].append(total_err)
for key, label in (("bias", "Bias"), ("variance", "Variance"),
                   ("total_err", "Total Error")):
    plt.plot(results["ccp_alpha"], results[key], label=label)
plt.xlabel("ccp_alpha")
plt.legend()
plt.grid()
plt.show()
Overall, this model applied to the decision tree's dataset, has a high bias and a low variance error. There is a trend towards a slight increase in bias and a decrease in variance, the higher the ccp_alpha.
High bias and low variance can cause a model to miss relevant relationships between the features and the target. This plot indicates that, for a high value of alpha, the model becomes less expressive and too generalizated, potentially generating underfitting.
Knowing this, it is ideal to have a balance between the variance and the bias in the model. In this particular case, it is recomended to opt for a lower ccp_alpha value, ranging 0 and 0.05, to avoid losing relevant information.
3. Interpreting the results in terms of model capacity, bias error, variance error, and total error¶
Model capacity refers to the flexibility or complexity of the decision tree model to capture patterns in the data. When the ccp_alpha value is high (equal to or higher than 0.0075), the decision tree is pruned more aggressively, resulting in lower model capacity. It also leads to higher bias error, as the decision tree is more pruned and less capable of capturing complex patterns. Higher levels of noise also require lower ccp_alpha values to mitigate variance error and prevent overfitting.
As the noise level increases, the optimal ccp_alpha value decreases in general, indicating the need for lower model capacity to combat the influence of noise and prevent overfitting. A slightly lower ccp_alpha value (e.g., around 0.005) is more effective in controlling bias error, allowing for more flexibility in the decision tree's structure to capture underlying patterns without being overly influenced by noise.
Task 3: Compare Bagging, RandomForest and AdaBoost¶
from sklearn.ensemble import AdaBoostClassifier as AdaC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import BaggingClassifier as BagC
from sklearn.model_selection import cross_val_score
# Re-generate the tree-friendly dataset for the ensemble comparison below.
X, y = decision_tree(500, 200)
An OOB prediction can be obtained in this way for each of the n observations, from which the overall OOB MSE (for a regression problem) or classification error (for a classification problem) can be computed.
The resulting OOB error is a valid estimate of the test error for the bagged model, since the response for each observation is predicted using only the trees that were not fit using that observation.
Learning curves of Random Forest and Bagging along the number of trees and using OOB¶
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import numpy as np
import matplotlib.pyplot as plt
# Range of tree/estimator counts to sweep over.
ntree = np.array([1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 150, 200, 300, 400, 500])
# Compute the OOB error of a RandomForest for each requested tree count.
def calculate_oob_errors_rf(X, y, n_estimators):
    """Return the out-of-bag error for every tree count in `n_estimators`.

    Fix: the original ignored the `n_estimators` argument and iterated over
    the global `ntree` instead (its loop variable also shadowed the
    parameter). Callers passing `ntree` get identical results.
    """
    oob_errors = []
    for n in n_estimators:
        model = RandomForestClassifier(n_estimators=n, oob_score=True,
                                       random_state=5)
        model.fit(X, y)
        # oob_score_ is the OOB accuracy, so 1 - score is the OOB error.
        oob_errors.append(1 - model.oob_score_)
    return oob_errors
# Compute the OOB error of a Bagging ensemble for each requested estimator count.
def calculate_oob_errors_bagging(X, y, n_estimators):
    """Return the out-of-bag error for every estimator count in `n_estimators`.

    Fix: the original ignored the `n_estimators` argument and iterated over
    the global `ntree` instead (its loop variable also shadowed the
    parameter). Callers passing `ntree` get identical results.
    """
    oob_errors = []
    for n in n_estimators:
        model = BaggingClassifier(n_estimators=n, oob_score=True,
                                  random_state=5)
        model.fit(X, y)
        # oob_score_ is the OOB accuracy, so 1 - score is the OOB error.
        oob_errors.append(1 - model.oob_score_)
    return oob_errors
# Compute OOB errors for both ensembles over the tree-count grid.
oob_errors_rf = calculate_oob_errors_rf(X, y, ntree)
oob_errors_bagging = calculate_oob_errors_bagging(X, y, ntree)
# Plot the two learning curves side by side.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
panels = zip(axs,
             (oob_errors_rf, oob_errors_bagging),
             ('green', 'blue'),
             ('Random Forest: Out-of-Bag Error', 'Bagging: Out-of-Bag Error'),
             ('Number of Trees', 'Number of Estimators'))
for ax, errors, color, title, xlabel in panels:
    ax.plot(ntree, errors, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Out-of-Bag Error')
    ax.grid(True)
plt.tight_layout()
plt.show()
The Out-of-Bag (OOB) error in Random Forest and Bagging models generated with different tree sizes starts to stabilize at around 100 trees. Therefore, the optimal number of tree sizes used to generate the models is 150 trees for both cases. Besides being sufficient to reduce OOB errors, using only 150 trees in both Random Forest and Bagging can also speed up the computation time to generate the models.
from sklearn.ensemble import RandomForestClassifier
import numpy as np
def calculate_oob_improvements_RF(X, y, n_estimators):
    """Return per-step OOB-score improvements for RandomForest sizes 2..n_estimators.

    Element i corresponds to oob_score(i+2 trees) - oob_score(i+1 trees),
    i.e. positive when adding a tree helped.

    Fixes vs the original:
    - The baseline model was fitted with the FULL `n_estimators`, so the
      first "improvement" compared the full-size score against a 2-tree
      score. The baseline is now a single-tree forest.
    - The sign was inverted (previous - current), which reported genuine
      improvements as negative values.
    """
    # Baseline: a single-tree forest (OOB coverage is sparse here; the
    # notebook suppresses the resulting sklearn warning globally).
    model_rf = RandomForestClassifier(n_estimators=1, oob_score=True)
    model_rf.fit(X, y)
    previous_oob_score = model_rf.oob_score_
    oob_improvements = []
    # Refit with one more estimator each iteration and record the delta.
    for i in range(2, n_estimators + 1):
        model_rf.set_params(n_estimators=i)
        model_rf.fit(X, y)
        current_oob_score = model_rf.oob_score_
        oob_improvements.append(current_oob_score - previous_oob_score)
        previous_oob_score = current_oob_score
    return oob_improvements
from sklearn.ensemble import BaggingClassifier
import numpy as np
def calculate_oob_improvements_BC(X, y, n_estimators):
    """Return per-step OOB-score improvements for Bagging sizes 2..n_estimators.

    Element i corresponds to oob_score(i+2 estimators) - oob_score(i+1
    estimators), i.e. positive when adding an estimator helped.

    Fixes vs the original:
    - The baseline model was fitted with the FULL `n_estimators`, so the
      first "improvement" compared the full-size score against a 2-estimator
      score. The baseline is now a single-estimator ensemble.
    - The sign was inverted (previous - current), which reported genuine
      improvements as negative values.
    """
    # Baseline: a single-estimator ensemble (OOB coverage is sparse here;
    # the notebook suppresses the resulting sklearn warning globally).
    model_bagging = BaggingClassifier(n_estimators=1, oob_score=True)
    model_bagging.fit(X, y)
    previous_oob_score = model_bagging.oob_score_
    oob_improvements = []
    # Refit with one more estimator each iteration and record the delta.
    for i in range(2, n_estimators + 1):
        model_bagging.set_params(n_estimators=i)
        model_bagging.fit(X, y)
        current_oob_score = model_bagging.oob_score_
        oob_improvements.append(current_oob_score - previous_oob_score)
        previous_oob_score = current_oob_score
    return oob_improvements
The OOB improvements for Gradient Boosting, Random Forest and Bagging¶
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
n_estimators = 200
# Gradient Boosting exposes oob_improvement_ directly (requires subsample < 1).
model_gbc = GBC(n_estimators=n_estimators, learning_rate=0.1, max_depth=3,
                subsample=0.9).fit(X, y)
# Bagging and RandomForest improvements via the helper functions above.
bag_oob_improvement = calculate_oob_improvements_BC(X, y, n_estimators)
rf_oob_improvement = calculate_oob_improvements_RF(X, y, n_estimators)
# Three side-by-side panels, one per ensemble method.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    (np.arange(1, n_estimators + 1), model_gbc.oob_improvement_,
     'Gradient Boosting: Out-of-Bag Improvement'),
    (np.arange(1, n_estimators), bag_oob_improvement,
     'Bagging: Out-of-Bag Improvement'),
    (np.arange(1, n_estimators), rf_oob_improvement,
     'Random Forest: Out-of-Bag Improvement'),
]
for ax, (xs, ys, title) in zip(axs, panels):
    ax.plot(xs, ys)
    ax.set_title(title)
    ax.set_xlabel('Number of Estimators')
    ax.set_ylabel('OOB Improvement')
plt.tight_layout()
plt.show()
Learning curves of AdaBoost, Gradient Boosting, Random Forest and Bagging using cross validation¶
nfolds = 5 #number of cross-validation folds
RFfeat = int(np.sqrt(X.shape[1])) # number of features considered at each RandomForest split
# (square root of the number of features in X, rounded down to an integer)
ntree = np.array([1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 200, 300]) # tree counts to evaluate
# Helper scorers: each returns the mean cross-validated misclassification error.
def score_gbc_class(m): # Gradient Boosting
    """Mean CV classification error of Gradient Boosting with m estimators."""
    fold_accuracies = cross_val_score(
        GBC(n_estimators=m, learning_rate=0.1, max_depth=3), X, y, cv=nfolds)
    return np.mean(1 - fold_accuracies)
def score_ada_class(m): # AdaBoost
    """Mean CV classification error of AdaBoost with m estimators."""
    fold_accuracies = cross_val_score(AdaC(n_estimators=m), X, y, cv=nfolds)
    return np.mean(1 - fold_accuracies)
def score_rf_class(nt): # Random Forest
    """Mean CV classification error of a Random Forest with nt trees."""
    fold_accuracies = cross_val_score(
        RFC(n_estimators=nt, max_features=RFfeat), X, y, cv=nfolds)
    return np.mean(1 - fold_accuracies)
def score_bag_class(nt): # Bagging
    """Mean CV classification error of a Bagging ensemble with nt estimators."""
    fold_accuracies = cross_val_score(BagC(n_estimators=nt), X, y, cv=nfolds)
    return np.mean(1 - fold_accuracies)
# Mean classification error for every method at every tree count.
error_rf_class = [score_rf_class(nt) for nt in ntree]
error_bag_class = [score_bag_class(nt) for nt in ntree]
error_ada_class = [score_ada_class(nt) for nt in ntree]
error_gbc_class = [score_gbc_class(nt) for nt in ntree]
# Plot the four learning curves together.
plt.figure(figsize=(10, 6))
for errors, color, label in (
        (error_rf_class, 'green', 'Random Forest'),
        (error_bag_class, 'orange', 'Bagging'),
        (error_ada_class, 'red', 'AdaBoost'),
        (error_gbc_class, 'blue', 'GradientBoost')):
    plt.plot(ntree, errors, color=color, label=label)
plt.xlabel('Number of Trees')
plt.ylabel('Classification Error')
plt.title('Learning Curve for Bagging, RandomForest, AdaBoost and GradientBoost')
plt.legend()
plt.grid(True)
plt.show()
When the number of base (B) models in Bagging and Random Forest is sufficiently large, the out-of-bag (OOB) error tends to be virtually equivalent to the error obtained from k-fold cross-validation. In fact, for large B, the OOB error can sometimes even be better than the error obtained from k-fold cross-validation. This phenomenon occurs because, with a large number of base models, the OOB samples cover a substantial portion of the dataset, making the OOB error estimation more accurate.
The OOB approach for estimating the test error is particularly convenient when performing bagging on large data sets for which cross-validation would be computationally onerous.
Similarly to bagging, the equivalence or superiority of the OOB error to k-fold cross-validation error for Random Forest depends on factors like dataset size, model complexity, and problem characteristics. In practice, Random Forest often provides robust and reliable performance estimation, especially for large and diverse datasets.
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
# Tree counts at which to compare the OOB score against the 5-fold CV score.
num_trees = [1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 200, 300, 500]
# Accumulators for OOB and CV scores of RandomForest and Bagging.
oob_rf_scores = []
cv_rf_scores = []
oob_bagging_scores = []
cv_bagging_scores = []
for n_trees in num_trees:
    # RandomForest: OOB accuracy vs mean 5-fold CV accuracy.
    rf_model = RandomForestClassifier(n_estimators=n_trees, oob_score=True,
                                      random_state=5)
    rf_model.fit(X, y)
    oob_rf_scores.append(rf_model.oob_score_)
    cv_rf_scores.append(np.mean(cross_val_score(rf_model, X, y, cv=5)))
    # Bagging: same comparison.
    # Fix: the original line read `random_state=)5` — a syntax error that
    # made the whole cell unrunnable.
    bagging_model = BaggingClassifier(n_estimators=n_trees, oob_score=True,
                                      random_state=5)
    bagging_model.fit(X, y)
    oob_bagging_scores.append(bagging_model.oob_score_)
    cv_bagging_scores.append(np.mean(cross_val_score(bagging_model, X, y, cv=5)))
n_estimators = [1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 200, 300, 500]
# Two panels: OOB vs CV score for RandomForest, then for Bagging.
plt.figure(figsize=(12,4))
panels = [
    ('RandomForestClassifier',
     oob_rf_scores, 'OOB Score (RandomForest)',
     cv_rf_scores, 'CV Score (RandomForest)'),
    ('BaggingClassifier',
     oob_bagging_scores, 'OOB Score (Bagging)',
     cv_bagging_scores, 'CV Score (Bagging)'),
]
for pos, (title, oob_vals, oob_label, cv_vals, cv_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    plt.plot(n_estimators, oob_vals, label=oob_label)
    plt.plot(n_estimators, cv_vals, label=cv_label)
    plt.xlabel('Number of Estimators')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend()
plt.tight_layout()
plt.show()
Besides that, OOB validation is much faster than k-fold cross-validation.
At 20 folds, our k-fold estimate of the MSE has comparable variation to our OOB estimate — while taking 20 times as long to compute. In general, you can think of OOB validation as requiring only “1x” computation, while k-fold cross-validation requires “kx”. In other words, it takes k times as long or k times as much compute. That can mean a huge difference in compute time.
References:¶
[1] https://towardsdatascience.com/3-techniques-to-avoid-overfitting-of-decision-trees-1e7d3d985a09
[2] https://www.kdnuggets.com/2022/09/decision-tree-pruning-hows-whys.html
[5] https://medium.com/data-science-at-microsoft/out-of-bag-validation-for-random-forests-378f2b292560
[6] https://www.randox.com/mu-vs-te/
[7] https://www.geeksforgeeks.org/bias-vs-variance-in-machine-learning/
[8] https://www.statology.org/linear-regression-assumptions/